#!/usr/bin/env perl

##########################################################################
# Extracts paradigm from newline-separated list of word forms            #
# using a strategy of extracting the longest common subsequence          #
# shared by all the forms in the paradigm.                               #
#                                                                        #
# Needs (1) foma installed, (2) extract.foma in the current directory    #
#                                                                        #
# Options: -p print full newline-separated forms instead of compact form #
#          -a print all possible paradigms that fit the MLCS pattern     #
#          -u collapse similar paradigms                                 #
#          -d print debug info to stderr                                 #
#                                                                        #
# Example input:                                                         #
#  ring                                                                  #
#  rang                                                                  #
#  rung                                                                  #
#  rings                                                                 #
#  ringing                                                               #
#                                                                        #
# Example output:                                                        #
#  1+"i"+2#1+"a"+2#1+"u"+2#1+"i"+2+"s"#1+"i"+2+"ing"    0=ring,1=r,2=ng  #
#                                                                        #
# with the -p flag (useful for debugging) the output is:                 #
#  r+"i"+ng                                                              #
#  r+"a"+ng                                                              #
#  r+"u"+ng                                                              #
#  r+"i"+ng+"s"                                                          #
#                                                                        #
# Several patterns may be sent (newline-separated), which produces an    #
# output line for each pattern.                                          #
# The -u flag collapses identical paradigms; however, the variable parts #
# are retained. For example, if the input is:                            #
#                                                                        #
# trång                                                                  #
# trängre                                                                #
#                                                                        #
# lång                                                                   #
# längre                                                                 #
#                                                                        #
# The output with the -u flag will be:                                   #
# 1+"å"+2#1+"ä"+2+"re"    0=trång,1=tr,2=ng#0=lång,1=l,2=ng              #
#                                                                        #
# MH20130923                                                             #
##########################################################################

#use open qw/:std :utf8/;
use FileHandle;
use IPC::Open2;
use Getopt::Std;
use File::Basename;
use Cwd 'abs_path';
use Encode;

my $dirname = dirname(abs_path($0));

getopts("apud", \%opts);

if (defined($opts{a})) { $PRINT_ALL = 1; } else { $PRINT_ALL = 0; }
if (defined($opts{p})) { $PRINT_PARADIGM = 1; } else { $PRINT_PARADIGM = 0; }
if (defined($opts{u})) { $COLLAPSE_PARADIGMS = 1; } else { $COLLAPSE_PARADIGMS = 0; }
if (defined($opts{d})) { $DEBUG = 1; } else { $DEBUG = 0; }

while (<STDIN>) {
    chomp;
    if ($_ eq "") {
	if (@wordforms) {
	    push @inputpatterns, join('#', @wordforms); # create array of patterns, each entry in pattern #-separated
	}
	undef @wordforms;
	next;
    }
    ($wordform, $dummy) = split /\t/ ;
    if ($wordform =~ /,?/) {
	@wfsalt = split /(?<=\,)/m, $wordform ; 
	push @wordforms, @wfsalt;
    } else {
	push @wordforms, $wordform;
    }
}

if (@wordforms) {
    push @inputpatterns, join('#', @wordforms); # array of patterns
}


foreach (@inputpatterns) {

    @wordforms = split '#';
    dprint("Analyzing $wordforms[0]\n");
    
    $lcp = longest_common_prefix(@wordforms); # We cut off the LCP to speed up foma's extraction of the sequences
                                              # but add it back after the pattern is extracted
    
    my @subs = ();
    foreach $i (0 .. $#wordforms) {
	if ($lcp ne "") {
	    $wordforms[$i] =~ s/^$lcp/|/;         # Replace LCP with |
	}
	push @subs, "Subsequence({" .$wordforms[$i] ."})";
    }

    $pidMORPH = open2($Reader, $Writer, "/usr/bin/env foma -p -l $dirname/extract.foma");
    print $Writer "define MLCS Longest( " .join('&', @subs)  ." ) ;\n"
                 ."define WordSeq {" .join('#', @wordforms) ."};\n"
                 ."define RepeatedPatterns AddExtra([MarkRoot(MLCS) [%# MarkRoot(MLCS)]*]);\n"
                 ."define RepeatedPatternsEQ AddExtra(RedupN(MarkRoot(MLCS), %#));\n"
                 ."define BracketedWordSeq RandomBracketing(WordSeq);\n"
                 ."regex {PATTERN:} Markup(Filter2(Filter1(RepeatedPatterns & BracketedWordSeq)) .o. RepeatedPatternsEQ);\n"

		 ."words\n"
                 ."quit\n";

    close($Writer);

    my @patterns = ();
    $has_analysis = 1;
    while (<$Reader>) {
	if ($_ =~ /0 paths/) {
	    $has_analysis = 0;
	    last;
	}
	if ($_ =~ /^PATTERN:/) {
	    s/^PATTERN://g;
	    chomp;	    
	    push(@patterns, $_);
	}
    }
    close($Reader);
    waitpid ($pidMORPH, 0);

    # If we didn't get an analysis with the first (quick) method
    # where we don't use RepeatedPatternsEQ, we do the more thorough
    # but slower filtering, where we use RepeatedPatternsEQ from the beginning

    if ($has_analysis == 0) {
	$pidMORPH = open2($Reader, $Writer, "/usr/bin/env foma -p -l $dirname/extract.foma");
	print $Writer "define MLCS Longest( " .join('&', @subs)  ." ) ;\n"
	    ."define WordSeq {" .join('#', @wordforms) ."};\n"
	    ."define RepeatedPatternsEQ AddExtra(RedupN(MarkRoot(MLCS), %#));\n"
	    ."define BracketedWordSeq RandomBracketing(WordSeq);\n"
	    ."regex {PATTERN:} Markup(Filter2(Filter1(RepeatedPatternsEQ & BracketedWordSeq)));\n"	    
	    ."words\n"
	    ."quit\n";
	close($Writer);
	
	@patterns = ();	
	while (<$Reader>) {
	    if ($_ =~ /^PATTERN:/) {
		s/^PATTERN://g;
		chomp;	    
		push(@patterns, $_);
	    }
	}
	close($Reader);
	waitpid ($pidMORPH, 0);
    }

###

    $mincost = 99999;
    for $i (0 .. $#patterns) {
	$cost = 0;
	@words = split '#', $patterns[$i];
	foreach (@words) {
	    s/^\"[^"]+\"//g;                    # cut beginning
	    s/\+\"[^"]+\"$//g;                  # cut end
	    s/\"([^"]+)\"/"|" x length($1)/eg;  # replace each quoted sym with auxiliary |
	    s/[^|]//g;                          # remove everything else
	    $cost += length($_);                # get "gappiness" count for string
	}
	@patterncosts[$i] = $cost;
	if ($cost < $mincost) { $mincost = $cost; } # store minimum cost to use for filtering
    }
    
    $printouts = 0;
    for $i (0 .. $#patterns) {
	if ($lcp ne "") {
	    $patterns[$i] =~ s/^\|/$lcp/;   # Put back LCP
	    $patterns[$i] =~ s/#\|/#$lcp/g;
	}
	$patternstring = $patterns[$i];
	if ($patterncosts[$i] <= $mincost or $PRINT_ALL == 1) {
	    $printouts++;
	    if ($printouts > 1 and $PRINT_ALL == 1) {
		print STDERR "***WARNING: Pattern is ambiguous***\n";
	    }
	    if ($PRINT_PARADIGM == 0) {
		$vstr = paradigm_get_variables($patternstring);
		$pstr = paradigm_generalize($patternstring);
		if ($printouts < 2 or $PRINT_ALL == 1) {
		    push @outpatterns, $pstr ."\t" .$vstr;
		}
	    } else {
		if ($printouts < 2 or $PRINT_ALL == 1) {
		    $patterns[$i] =~ s/\+\",\"\#/,/g; 
		    $patterns[$i] =~ s/,\"\#/",/g;
		    @words = split '#', $patterns[$i];
		    print join("\n", @words) ."\n\n";
		}
	    }
	}
    }
}

# Join alternate forms
map {s/\+\",\"\#/,/g; } @outpatterns;
map {s/,\"\#/",/g; } @outpatterns;

if ($PRINT_PARADIGM == 0) {
    if ($COLLAPSE_PARADIGMS == 1) {
	print join ("\n", paradigm_uniq(@outpatterns)) ."\n";
    }
    else {
	print join("\n", @outpatterns) ."\n";
    }
}

sub paradigm_uniq {

   # Collapses identical paradigms
   # Pass array of paradigms and variables, and return combined paradigms
   # Example input:
   # 1#1+"are"#1+"ast" (TAB) 0=fort,1=fort    <= [array entry 1]
   # 1#1+"are"#1+"ast" (TAB) 0=tokig,1=tokig  <= [array entry 2]
   # Output:
   # 1#1+"are"#1+"ast" (TAB) 0=fort,1=fort#0=tokig,1=tokig [array entry 1]

    my @p = @_;
    my @uniqp = ();
    foreach (@p) {
	(my $lhs, my $rhs) = split /\t/;
	my $found = 0;
	for my $i (0 .. $#uniqp) {
	    (my $lhsuniq, my $rhsuniq) = split /\t/, $uniqp[$i];
	    if ($lhs eq $lhsuniq) {
		$uniqp[$i] = $lhsuniq ."\t" .$rhsuniq ."#" .$rhs;
		$found = 1;
		last;
	    }
	}
	if ($found == 0) {
	    push @uniqp, $lhs ."\t" .$rhs;
	}
    }
    return(@uniqp);
}

sub paradigm_get_variables {
    my $ptrn = shift;
    my @forms = split '#', $ptrn;
    my $varcnt = 1;
    $firstform = $forms[0];
    $fullform = $firstform;
    $fullform =~ s/,//g;
    $fullform =~ s/["+]+//g;
    $firstform = "+" .$firstform ."+";
    $firstform =~ s/\+([^+"]+)(?=\+)/"," .$varcnt++ ."=" .$1/ge;
    $firstform =~ s/\"[^"]+\"//g;
    $firstform =~ s/\+//g;
    my $varline = "0=$fullform" .$firstform;
    return($varline);
}

sub paradigm_generalize {

    # Convert a pattern of a specific type to a general one with variables, e.g.
    # ka+"t"+to#ka+to+"t"#ka+to+"n"#ka+"t"+to+"jen"#... =>
    # 1+"t"+2#1+2+"t"#1+2+"n"#1+"t"+2+"jen"#...
    # This allows for easier comparison of paradigms

    my $ptrn = shift;
    my @forms = split '#', $ptrn;
    my @newforms;
    foreach (@forms) {
	my $varcnt = 1;
	my $pstring = "+" .$_ ."+";
	$pstring =~ s/\+([^+"]+)(?=\+)/"+" .$varcnt++/ge;
	$pstring =~ s/^\+//g;
	$pstring =~ s/\+$//g;
	push @newforms, $pstring;
    }
    $genforms = join ('#', @newforms);
    return $genforms;
}

# Returns the longest common prefix of an array of strings
sub longest_common_prefix {
    my $p = shift;
    for (@_) {
	chop $p while (! /^\Q$p\E/);
    }
    # Make sure we don't have any partial utf8 hanging at the end thanks to perl
    $p =~ s/^(([\001-\177]|[\300-\337].|[\340-\357]..|[\360-\367]...)+).*/\1/;
    return $p;
}

# Debug printing
sub dprint {
    my $arg = shift;
    if ($DEBUG == 1) {
        print STDERR $arg;
    }
}
